## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [conflicted] Will prefer dplyr::filter over any other package.
## [conflicted] Will prefer dplyr::select over any other package.
## [conflicted] Will prefer dplyr::slice over any other package.
## [conflicted] Will prefer dplyr::rename over any other package.
## [conflicted] Will prefer dplyr::intersect over any other package.
table with all measured Cq values
## Rows: 1560 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (11): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (15): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cq = cq %>%
# qPCR_val is 'pass' for all circ that have at least one replicate > 10
mutate(qPCR_val = ifelse((Cq_min_untreated < 10) & (Cq_max_untreated < 10), 'fail', 'pass'),
qPCR_val = ifelse((is.na(Cq_min_untreated)) & (is.na(Cq_max_untreated)), 'fail', qPCR_val),
# qPCR_val_detail describes the reason why a circ fails this validation step if it does
qPCR_val_detail = ifelse((Cq_min_untreated < 10) & (Cq_max_untreated < 10), 'all_Cq_under_10', 'pass'),
qPCR_val_detail = ifelse(is.na(Cq_min_untreated) & is.na(Cq_max_untreated), 'all_NA', qPCR_val_detail))
cqstatistics
cq = cq %>%
# calculate delta cq
# first make temp 47 for NAs
mutate(Cq_min_treated_tmp = Cq_min_treated,
Cq_min_treated = ifelse(is.na(Cq_min_treated), 47, Cq_min_treated),
Cq_diff = Cq_min_treated - Cq_max_untreated) %>%
mutate(# fail if diff > 3
RR_val = ifelse(Cq_diff > 3, "fail", 'pass'),
RR_val_detail = ifelse(RR_val == 'fail', "FP (Cq_diff > 3)", 'pass'),
# when Cq untreated is high => put NA
RR_val = ifelse(Cq_min_untreated > 32, NA, RR_val),
RR_val_detail = ifelse(Cq_min_untreated > 32, 'out_of_range', RR_val_detail),
# if qPCR already failed => RR also becomes 'fail'
RR_val = ifelse(qPCR_val == "fail", 'fail', RR_val),
RR_val_detail = ifelse(qPCR_val == "fail", "fail_qPCR", RR_val_detail)) %>%
# clean up NAs for 47
mutate(Cq_diff = ifelse(Cq_min_treated == 47, NA, Cq_diff),
Cq_min_treated = Cq_min_treated_tmp) %>%
select(-Cq_min_treated_tmp) %>%
#' to clean up dataframe: also remove Cq_diff for 'all_Cq_under_10' group and out_of_range group
mutate(Cq_diff = ifelse(qPCR_val_detail == 'all_Cq_under_10', NA, Cq_diff),
Cq_diff = ifelse(RR_val_detail == 'out_of_range', NA, Cq_diff))
cqstatistics
## Rows: 1170 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): circ_id, cell_line, amp_seq_val, amp_seq_val_detail
## dbl (1): perc_on_target
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
statistics
add to cq dataframe
cq = cq %>%
left_join(cq_amp %>% unique()) %>%
mutate(amp_seq_val_detail = ifelse(is.na(amp_seq_val_detail), "not_included", amp_seq_val_detail))## Joining with `by = join_by(circ_id, cell_line)`
statistics
cq = cq %>%
mutate(RR_val_tmp = ifelse(RR_val_detail == 'out_of_range', 'fail', RR_val)) %>% # when circ cannot be measured for RR => count as fail
mutate(compound_val = ifelse(qPCR_val == "pass" & RR_val_tmp == "pass" & amp_seq_val == "pass", "pass", "NA"),
compound_val = ifelse(qPCR_val == 'fail' | RR_val_tmp == "fail" | amp_seq_val == 'fail', 'fail', compound_val),
compound_val = ifelse(is.na(amp_seq_val), NA, compound_val)) %>%
select(-RR_val_tmp)statistics
## Rows: 1137099 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (16): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group...
## dbl (7): start, end, BSJ_count, n_detected, n_db, estim_len_in, BSJ_count_m...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(chr, start, end, strand, circ_id, circ_id_strand,
## tool, cell_line, BSJ_count, count_group)`
perc_val = cq %>%
select(tool, circ_id, cell_line, count_group, qPCR_val, RR_val, RR_val_detail, amp_seq_val, amp_seq_val_detail, compound_val) %>%
group_by(tool, count_group) %>%
summarise(nr_qPCR_total = n(),
nr_qPCR_fail = sum(qPCR_val == 'fail'),
nr_qPCR_val = sum(qPCR_val == 'pass'),
nr_RR_total = n() - sum(is.na(RR_val)), # here NA are the ones that have are 'out_of_range'
nr_RR_fail = sum(RR_val == "fail", na.rm = T),
nr_RR_val = sum(RR_val == "pass", na.rm = T),
nr_amp_total = n() - sum(is.na(amp_seq_val)), # here NA are the ones 'not_included'
nr_amp_fail = sum(amp_seq_val == "fail", na.rm = T),
nr_amp_val = sum(amp_seq_val == "pass", na.rm = T),
nr_compound_total = n() - sum(is.na(amp_seq_val)), # here NA are the ones 'not_included'
nr_compound_fail = sum(compound_val == "fail", na.rm = T),
nr_compound_val = sum(compound_val == "pass", na.rm = T)) %>%
mutate(perc_qPCR_val = nr_qPCR_val/nr_qPCR_total,
perc_RR_val = nr_RR_val/nr_RR_total,
perc_amp_val = nr_amp_val/nr_amp_total,
perc_compound_val = nr_compound_val/nr_compound_total) %>%
ungroup()## `summarise()` has grouped output by 'tool'. You can override using the
## `.groups` argument.
first calculate the total nr of validated circ per count group
nr_val = cq %>%
# get set of uniquely validated circRNAs
filter(compound_val == 'pass') %>%
select(circ_id, cell_line, count_group_median) %>% unique() %>%
count(count_group_median) %>%
rename(nr_expected = n)
nr_valthen calculate sensitivity by dividing nr of circ found by total
sens = cq %>%
# get set of uniquely validated circRNAs
filter(compound_val == 'pass') %>%
select(circ_id, cell_line, count_group_median) %>% unique() %>%
# check witch tools have detected these
left_join(all_circ %>%
select(tool, circ_id, cell_line) %>% unique()) %>%
group_by(tool, count_group_median) %>%
summarise(nr_detected = n()) %>%
left_join(nr_val) %>%
mutate(sensitivity = nr_detected/nr_expected) %>%
ungroup()## Joining with `by = join_by(circ_id, cell_line)`
## `summarise()` has grouped output by 'tool'. You can override using the
## `.groups` argument.
## Joining with `by = join_by(count_group_median)`
sens_2 = perc_val %>%
# use perc_compound_val
select(tool, count_group, perc_compound_val) %>%
# get the number of detected circRNAs for each cell line and tool, per count group
left_join(all_circ %>%
group_by(tool, count_group) %>%
summarise(total_n = n())) %>%
# calculate the theoretical nr of validated circ
mutate(extrapolated_sensitivity = perc_compound_val * total_n) ## `summarise()` has grouped output by 'tool'. You can override using the
## `.groups` argument.
## Joining with `by = join_by(tool, count_group)`
take both types of sensitivity together and fix Sailfish-circ count group thing
sens = sens %>%
filter(!tool == 'Sailfish-cir') %>%
full_join(sens_2 %>% filter(!tool == 'Sailfish-cir'),
by = c('count_group_median' = 'count_group', 'tool')) %>%
bind_rows(sens %>%
filter(tool == 'Sailfish-cir') %>%
full_join(sens_2 %>% filter(tool == 'Sailfish-cir') %>%
select(-count_group),
by = c('tool')))
sensval_sens_df = perc_val %>%
select(tool, count_group, perc_qPCR_val, perc_RR_val, perc_amp_val, perc_compound_val) %>%
full_join(sens %>% rename(count_group = count_group_median) %>%
select(-nr_detected, -nr_expected, -total_n) %>%
mutate(count_group = ifelse(tool == "Sailfish-cir", 'no_counts', count_group))) %>%
full_join(all_circ %>%
group_by(tool, count_group) %>% count()) %>%
group_by(count_group) %>%
mutate(qPCR_precision_rank = dense_rank(desc(perc_qPCR_val)),
RR_precision_rank = dense_rank(desc(perc_RR_val)),
amp_precision_rank = dense_rank(desc(perc_amp_val)),
compound_precision_rank = dense_rank(desc(perc_compound_val)),
sensitivity_rank = dense_rank(desc(sensitivity)),
nr_circ_rank = dense_rank(desc(n)),
extrapolated_sensitivity_rank = dense_rank(desc(extrapolated_sensitivity))) %>%
ungroup() %>%
filter(!is.na(n)) %>%
select(tool, count_group, n, nr_circ_rank, perc_qPCR_val, qPCR_precision_rank,
perc_RR_val, RR_precision_rank, perc_amp_val, amp_precision_rank,
perc_compound_val, compound_precision_rank, sensitivity, sensitivity_rank,
extrapolated_sensitivity, extrapolated_sensitivity_rank) %>%
arrange(desc(count_group), tool) %>%
rename(nr_circ_detected = n, qPCR_precision = perc_qPCR_val,
RR_precision = perc_RR_val, amp_precision = perc_amp_val,
compound_precision = perc_compound_val) %>%
mutate(across(c(qPCR_precision, RR_precision, amp_precision,
compound_precision, sensitivity), round, 4),
extrapolated_sensitivity = round(extrapolated_sensitivity, 0))## Joining with `by = join_by(tool, count_group, perc_compound_val)`
## Joining with `by = join_by(tool, count_group)`
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(...)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
calculate enrichment
## Rows: 1694840 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (14): chr, strand, cell_line, RNaseR, tool, circ_id, circ_id_strand, cou...
## dbl (4): start, end, BSJ_count, estim_len_in
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
treatment = all_circ %>%
rename(count_UT = BSJ_count) %>%
select(chr, start, end, strand, cell_line, tool, circ_id, circ_id_strand, count_UT, count_group) %>%
full_join(all_circ_treated %>%
rename(count_T = BSJ_count) %>%
select(chr, start, end, strand, cell_line, tool, circ_id, circ_id_strand, count_T))## Joining with `by = join_by(chr, start, end, strand, cell_line, tool, circ_id,
## circ_id_strand)`
treatment = treatment %>%
left_join(read_tsv('../data/details/nr_reads.txt') %>%
select(-RNA_id) %>%
pivot_wider(names_from = RNaseR, values_from = nr_reads) %>%
rename(nr_reads_T = treated, nr_reads_UT = untreated)) %>%
# calculate CPMs and enrichment factor (Sailfish-cir is already TPM)
mutate(cpm_T = ifelse(tool == 'Sailfish-cir', count_T, 1000000 * count_T/nr_reads_T),
cpm_UT = ifelse(tool == 'Sailfish-cir', count_UT, 1000000 * count_UT/nr_reads_UT),
enrichment = cpm_T/cpm_UT,
enrichment_bin = ifelse(enrichment > 1, 'enriched', 'not enriched'),
enrichment_bin = ifelse(is.na(count_T), 'count treated NA', enrichment_bin),
enrichment_bin = ifelse(is.na(count_UT), 'count untreated NA', enrichment_bin))## Rows: 6 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): cell_line, RNaseR, RNA_id
## dbl (1): nr_reads
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(cell_line)`
## 0% 25% 50% 75% 100%
## 0.9000 0.9875 0.9875 1.0000 1.0000
## 0% 25% 50% 75% 100%
## 0.80 0.90 0.95 1.00 1.00
## 0% 25% 50% 75% 100%
## 0.7402597 0.9491237 0.9625000 0.9808504 1.0000000
## 0% 25% 50% 75% 100%
## 0.5000000 0.8000000 0.8666667 0.9411765 1.0000000
## 0% 25% 50% 75% 100%
## 0.3000000 0.9373732 0.9552239 0.9844203 1.0000000
## 0% 25% 50% 75% 100%
## 0.1764706 0.6111111 0.7333333 0.8666667 0.9411765
## 0% 25% 50% 75% 100%
## 0.2714286 0.9055060 0.9310345 0.9538352 0.9830508
## 0% 25% 50% 75% 100%
## 0.05882353 0.53846154 0.63636364 0.76470588 0.88235294
sens %>%
filter(!count_group_median == 'count ≥ 5') %>%
filter(!tool == 'circRNA_finder',
!tool == 'segemehl') %>%
pull(sensitivity) %>%
quantile()## 0% 25% 50% 75% 100%
## 0.1923077 0.5494505 0.7170330 0.7898352 0.9395604
## 0% 25% 50% 75% 100%
## 0.3298566 0.7692308 0.8494133 0.9524120 0.9634941
## [1] 0.975
## [1] 0.939
## [1] 0.074
cq %>%
filter(count_group == 'count < 5') %>%
select(circ_id, cell_line, qPCR_val) %>% unique() %>%
count(qPCR_val)## [1] 0.935
nr of unique circ that pass all val methods
cq %>%
select(circ_id, qPCR_val, RR_val, amp_seq_val, cell_line) %>%
unique() %>%
filter(qPCR_val == 'pass', RR_val == 'pass', amp_seq_val == 'pass')## [1] 0.631
nr of unique circ that fail all val methods
cq %>%
select(circ_id, qPCR_val, RR_val, amp_seq_val, cell_line) %>%
unique() %>%
filter(qPCR_val == 'fail', RR_val == 'fail', amp_seq_val == 'fail')## [1] 0.012
nr of unique circ that fail 1 or 2 val methods
cq %>%
select(circ_id, qPCR_val, RR_val, amp_seq_val, cell_line) %>%
filter(!is.na(RR_val), !is.na(amp_seq_val)) %>%
unique() %>%
mutate(val_nr = str_count(paste(qPCR_val, RR_val, amp_seq_val),
pattern = 'pass')) %>%
#filter(val_nr == 0)
#filter(val_nr == 3)
filter(val_nr < 3, val_nr > 0)## [1] 0.084
nr of cicrc that have at least one NA
cq %>%
select(circ_id, qPCR_val, RR_val, amp_seq_val, cell_line) %>%
unique() %>%
mutate(NA_nr = str_count(paste(qPCR_val, RR_val, amp_seq_val),
pattern = 'NA')) %>%
#filter(NA_nr == 0)
#filter(NA_nr == 3)
filter(NA_nr > 0)## [1] 0.273
nr of unique circ that have no NAs
cq %>%
filter(!is.na(RR_val), !is.na(amp_seq_val)) %>%
select(circ_id, qPCR_val, RR_val, amp_seq_val, cell_line) %>%
unique()=> recalculate previous numbers based on no NAs all pass
## [1] 0.867
all fail
## [1] 0.016
one or two fail
## [1] 0.116